'''


exec(''.join(open("/workERS/daltonm/corona/ppp/codefiles/CRppppt2.py", encoding="utf8").readlines()[:]))
nohup python3 /workERS/daltonm/corona/ppp/codefiles/CRppppt2.py  | tee &
'''


import string
import sys

import matplotlib as mpl
import numpy as np
import pandas as pd


# Base name of this script file (see the launch commands in the header string).
filename = 'CRppppt2'

# Root folder that the input data files are read from.
dataloc=  "/dataERS/eract/daltonm/"
# NOTE: this value is overwritten further down with a dated folder under resultsloc1.
resultsloc="/workERS/daltonm/corona/ppp/results"


import os

# Make the project helper modules importable before the star-imports below.
sys.path.append('/workERS/daltonm/BG')
#try:
from CRmergeBGLDBv6func import *

sys.path.append('/workERS/daltonm/corona')
from basicfunctions import *

# Parent folder of the per-day results folders.
resultsloc1 = "/home/daltonm/ppp/"

from datetime import date
# Results go to a folder named after today's date (YYYYMMDD); it is created
# at import time if it does not exist yet.
datestr = date.today().strftime(format="%Y%m%d")
resultsloc = resultsloc1 + datestr + '/'
if not os.path.exists(resultsloc):
    os.makedirs(resultsloc)


from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct


import re

import time
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


'''
coming from geocoding_user_guide sent by wen luo
'''

'''
['Address', 'BusinessName', 'BusinessType', 'CD', 'City', 'DateApproved', 'Gender', 'JobsReported', 'Lender', 'LoanAmount', 'NAICSCode', 'Name of File', 'NonProfit', 'RaceEthnicity', 'State', 'Veteran', 'Zip', 'st', 'ID']

'''

# Business-entity suffixes; employeredit() drops one of these when it is the
# last word of an employer name (checked twice, so two stacked suffixes go).
removelists =['llc', 'corporation', 'incorporated',
                         'limited', 'llp', 'pllc', 'company',
                         'inc', 'ltd',
                         'co', 'lp', 'comp',  'partnerships', 'partners',
                         'part', 'associates', 'assoc', 'corp', 'intl']

# removelists2 = '|'.join(['\!', '\@', '\#', '\%', '\^', '\&', '\*', '\(', '\)', '\,', '\.', '\[', '\]', '\'',
#                          '\-', '^the ', '\;', ' the$', '\"', '\/', '\+'])

# Regex alternation removed from employer names by employeredit(): spelled-out
# entity abbreviations ("l l c") and a leading / trailing "the".
removelists2='|'.join(['l l p', 'l l c', 'l t d','^the ',' the$'])

# Words removed from anywhere in an employer name by employeredit().
# these are coming from a subset of the intersection of most common words between lists
# 'american','company'
removelists3= ['n', 'of', 'services', 'group', 'service', 'usa', 'international', 'america',
                'llc', 'llp', 'ltd','and','us']

# Place-type suffixes that cityedit() strips from the end of a city name.
removecitylist = ['city', 'cdp', 'town', 'village', 'borough']

import re


def cityedit(x, removecitylist=removecitylist):
    '''
    Normalise a city name so that city strings from different sources compare equal.

    Steps: lower-case, drop punctuation, strip one trailing place-type word
    (e.g. "jersey city" -> "jersey") and expand the abbreviation "st" to "saint".

    :param x: raw city name (any value; it is converted with str())
    :param removecitylist: place-type words to strip from the end of the name
    :return: cleaned city name
    '''
    name = str(x).lower().translate(str.maketrans('', '', string.punctuation)).strip()
    for suffix in removecitylist:
        # Only a trailing *whole word* is a place-type suffix. A plain
        # endswith()/replace() would turn "hillsborough" into "hills" and
        # would also delete the word wherever else it appears in the name.
        if name == suffix or name.endswith(' ' + suffix):
            name = name[:-len(suffix)].strip()
            break
    if name.startswith('st '):
        # Expand only the leading abbreviation; replacing every 'st ' would
        # corrupt words that merely end in "st" (e.g. "east ").
        name = 'saint ' + name[len('st '):]
    return name.replace(' st ', ' saint ').strip()

def employeredit(x,removelists=removelists,removelists2=removelists2,removelists3=removelists3):
    '''
    Clean an employer name for matching.

    :param x: row-like value holding (employer name, cleaned city, state key);
              a pandas row or any sequence works, values are read by position
    :param removelists: business suffixes, dropped when they are the last word
    :param removelists2: regex of awkward letter combos removed from the name
    :param removelists3: words removed wherever they appear
    :return: tuple (name with single spaces, same name with all whitespace removed)
    '''
    # list() reads the values by position without relying on integer
    # indexing of a label-indexed pandas row (deprecated in recent pandas).
    values = list(x)
    raw_name = values[0]
    raw_city = values[1]
    state_key = values[2] if len(values) > 2 else None

    text = str(raw_name).lower().translate(str.maketrans('', '', string.punctuation))
    # gets rid of some awkward letter combos
    text = re.sub(removelists2, '', text)
    # split() (not split(' ')) so repeated blanks never produce empty tokens
    # that would hide a trailing suffix from the checks below
    words = [word for word in text.split() if word not in removelists3]

    # a name can carry two stacked suffixes ("... company inc")
    for _ in range(2):
        if words and words[-1] in removelists:
            del words[-1]

    # remove lowered city name if at end of employer string (handles
    # multi-word cities such as "new york" as well)
    city_words = str(raw_city).lower().split()
    if city_words and words[-len(city_words):] == city_words:
        del words[-len(city_words):]

    y = ' '.join(words)

    # replace full state name with abbreviation; stateinfo is a project
    # helper, so any failure to resolve the key just skips this step
    try:
        state = stateinfo(state_key, 'name').lower()
        st = stateinfo(state_key, 'abbr').lower()
    except Exception:
        state = st = None
    if state and st:
        y = re.sub(re.escape(state), st, y)
        if len(st) == 2:
            # lastly, if for some reason the abbreviation letters are split up, close them
            y = re.sub(' ' + re.escape(st[0]) + ' ' + re.escape(st[1]) + ' ', ' ' + st + ' ', y)

    # remove "united states", then collapse any run of whitespace left behind
    y = re.sub('united states', '', y)
    y = re.sub(r'\s+', ' ', y).strip()
    # no-spaces version
    z = re.sub(r"\s+", "", y, flags=re.UNICODE)
    return(y, z)



#
def latcoord(x):
    '''
    Convert a coordinate string with a hemisphere suffix into a signed float.

    "34.05N" / "12.5E" give a positive value, "34.05S" / "118.2W" a negative
    one. Anything else (empty, missing, no hemisphere letter, non-numeric)
    gives NaN, so the result is always a float.

    :param x: coordinate value as read from the geocoder output
    :return: signed coordinate, or NaN when it cannot be parsed
    '''
    text = str(x).strip()
    if not text:
        return np.nan
    hemisphere = text[-1]
    if hemisphere not in ('N', 'E', 'S', 'W'):
        return np.nan
    try:
        magnitude = float(text[:-1])
    except ValueError:
        return np.nan
    return magnitude if hemisphere in ('N', 'E') else -magnitude




def _first_mode(values):
    '''Most frequent non-null value of a Series (smallest one on ties), NaN if there is none.'''
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


def _load_geocoded_addresses(path):
    '''
    Read a fixed-width geocoder output file and derive the address columns used for matching.

    Shared by allpppedits() and alleidledits(), which differ only in the file they read.

    :param path: full path of the AutoGeo fixed-width text file
    :return: DataFrame with columns ID, fipscounty, Latitude, Longitude,
             Dzip_problem, addr_edit, city (ID is left as a stripped string)
    '''
    #prelim info for reading in: column names and fixed widths of the geocoder output
    layout = pd.read_excel(geog + 'geocodeoutput.xlsx', engine='openpyxl')
    layout.dropna(inplace=True)
    widths = [int(w) for w in layout['widths'].values.tolist()]
    cols = [c.strip() for c in layout['names'].values.tolist()]
    #reading in actual data
    # NOTE(review): `error_bad_lines` is deprecated in newer pandas (replaced by
    # `on_bad_lines`) and no longer accepted from pandas 2.0 -- confirm the
    # pandas version in use before upgrading.
    dft2 = pd.read_fwf(path,
                       error_bad_lines=False,
                       widths=widths, names=cols)
    # the first 20 characters of the echoed input record hold the record ID
    dft2['ID'] = dft2['All the data from the Input record, unchanged.'].apply(lambda x: str(x)[:20].strip())
    # coerce instead of float(x): a comparison with NaN is always True, so the
    # old `x != np.NaN` guard never protected against unparsable values
    dft2['fipscounty'] = pd.to_numeric(dft2['County FIPS Code'], errors='coerce').astype(float)
    # plain column assignment + to_numeric guarantees float columns for the medians below
    dft2['Latitude'] = pd.to_numeric(dft2['Latitude'].apply(latcoord), errors='coerce')
    dft2['Longitude'] = pd.to_numeric(dft2['Longitude'].apply(latcoord), errors='coerce')
    '''
    for some reason there is an issue where a handful of zip codes have one outlier (in totally different state)
    - there might be outliers within state too
    - zip code - coordinates match up well
    - but zip code - county *do not*
    dealing with this
    1) identify where lat / long are more than BUFFER away from median for county
    --- replace those coordinates with the county median, and flag them in Dzip_problem
    '''
    # 9999 groups the records without a county so they still get a median
    dft2['county_temp'] = np.where(dft2['County FIPS Code'].isnull(),9999,dft2['County FIPS Code'])
    county_keys = ['FIPS State Code', 'county_temp']
    dft2['lat_med'] = dft2.groupby(county_keys)['Latitude'].transform('median')
    dft2['long_med'] = dft2.groupby(county_keys)['Longitude'].transform('median')
    '''
    buffer is determined by following:
    1) largest county in US (by area) is San Bernardino, which is 20,000 sq miles
    2) that is 141 x 141 miles
    3) midpoint would be 70.5
    4) 1 degree of latitude/ longitude is approximately 69 miles (does not vary much) / 53 miles (does vary)
    5) 70.5 divided by typical mileage for 1 degree
    '''
    latbuffer = 70.5 / 69
    longbuffer = 70.5 / 53
    #buffer away from median
    too_far = ((dft2['Latitude'] > dft2['lat_med'] + latbuffer)
               | (dft2['Latitude'] < dft2['lat_med'] - latbuffer)
               | (dft2['Longitude'] > dft2['long_med'] + longbuffer)
               | (dft2['Longitude'] < dft2['long_med'] - longbuffer))
    #only a problem if don't have a good StreetLevel match
    not_street_level = (dft2['Geocode Type'] != 'StreetLevel')
    dft2['Dzip_problem'] = np.where(too_far & not_street_level, 1, 0)
    flagged = (dft2['Dzip_problem'] == 1)
    dft2['Latitude'] = np.where(flagged, dft2['lat_med'], dft2['Latitude'])
    dft2['Longitude'] = np.where(flagged, dft2['long_med'], dft2['Longitude'])

    punct_table = str.maketrans('', '', string.punctuation)

    def street_address(row):
        # house number + street name with the street-type text removed.
        # NOTE(review): str.replace drops the street type wherever it occurs
        # in the name, and a missing type is the text 'nan'; left as-is so the
        # key stays comparable with the LDB addr_edit built elsewhere.
        number = str(row['House Number']).lower().strip()
        street = str(row['Street Name']).replace(str(row['Street type']), '')
        return (number + ' ' + street.translate(punct_table).lower()).strip()

    dft2['addr_edit'] = dft2[['House Number', 'Street Name', 'Street type']].apply(street_address, axis=1)
    no_address = ((dft2['House Number'].isnull()) | (dft2['Street Name'].isnull()))
    dft2['addr_edit'] = np.where(no_address, '', dft2['addr_edit'])
    dft2['city'] = dft2['City, standardized'].apply(lambda x: cityedit(str(x)))
    keepcols = ['ID','fipscounty','Latitude','Longitude','Dzip_problem', 'addr_edit', 'city']
    '''
    check
    '''
    #how many have city? street address?
    print(len(dft2))
    print(dft2['Geocode Type'].value_counts())
    print(dft2['Dzip_problem'].value_counts())
    print(dft2['House Number'].isnull().sum())
    print((dft2['Street Name'] == '').sum())
    samp(dft2[dft2['Dzip_problem']==1])
    return dft2[keepcols].copy()


def allpppedits():
    '''
    Prepare the PPP loan file for matching and write one file per commuting zone.

    Reads pppfiles/all_ppp.psv, merges in the geocoded address columns, cleans
    the employer name, attaches the commuting zone (cz) of each county and
    writes pppfiles/cz/ppp_<cz>.psv for every commuting zone found.

    :return: None
    '''
    dfp = pd.read_csv(dataloc + 'pppfiles/all_ppp.psv', sep = '|')
    #keeping original name for comparison purposes
    dfp['employer_orig'] = dfp['BorrowerName'].copy()
    dfp['fipsstate'] = dfp['st'].apply(lambda x: stateinfo(x, 'fips'))
    dfp['naics2dig'] = pd.Categorical(dfp['NAICSCode'].astype('str').str[:2])
    #get address info
    geo = _load_geocoded_addresses(dataloc + 'pppfiles/AutoGeo_ppp.txt')
    #merge in (IDs are compared as numbers in the PPP files)
    mc = ['ID']
    dfp['ID'] = pd.to_numeric(dfp['ID'], errors='coerce')
    geo['ID'] = pd.to_numeric(geo['ID'], errors='coerce')
    dfp = merging(dfp, geo, mc, mc, True, 'left')
    geo = None  # release the geocode table before the next large step
    #employer name: employer_edit keeps spaces, employer has them removed
    dfp[['employer_edit','employer']] = dfp[['employer_orig', 'city', 'fipsstate']].apply(lambda x: employeredit(x), axis=1, result_type='expand')
    #save to CZ
    dfg = pd.read_excel(dataloc + 'geography/' + 'commuting_zones.xls')
    # FIPS is state code followed by a 3-digit county code
    dfg['fipscounty'] = dfg['FIPS'].apply(lambda x: float(str(x)[-3:]))
    dfg['fipsstate'] = dfg['FIPS'].apply(lambda x: float(str(x)[:-3]))
    rdict = {
        'Commuting Zone ID, 2000' : 'cz',
    }
    mc = ['fipsstate', 'fipscounty']
    kcols = mc + ['cz']
    dfg = dfg.rename(columns = rdict)[kcols].drop_duplicates(subset = mc)
    dfp = dfp.merge(dfg[kcols], on=mc, how='left', indicator=True)
    ####filling in cz if missing
    ##get most common in state; _first_mode always yields one scalar, whereas
    ##pd.Series.mode returns an array on ties / empty groups and breaks to_numeric
    state_cz = dict(dfp.groupby('State')['cz'].agg(_first_mode))
    dfp['st_cz'] = dfp['State'].map(state_cz)
    dfp['cz'] = pd.to_numeric(dfp['cz'].fillna(dfp['st_cz']))
    czlist = [i for i in dfp['cz'].unique() if str(i)!='nan']
    for cz in czlist:
        cond = (dfp['cz'] == cz)
        cz = str(int(cz))
        dfp[cond].to_csv(dataloc + 'pppfiles/cz/' + 'ppp_' + cz + '.psv', sep = '|')
    return



def alleidledits():
    '''
    Prepare the EIDL file for matching and write one HDF5 file per state and award type.

    Splits records into grants and loans, cleans the employer name, merges in
    the geocoded address columns and writes pppfiles/eidl<type>_<ST>.h5.

    :return: empty tuple
    '''
    dfp = opensplitnew('all_eidl', [])
    #make simple dummy to separate
    cond = (dfp['AWARDDESC'] == 'Economic Injury Disaster Grant')
    dfp['Deidltype'] = np.where(cond, 'grant', 'loan')
    dfp['fipsstate'] = dfp['LEGALENTITYSTATECD'].apply(lambda x: stateinfo(x, 'fips'))
    #employer name
    # there is a "DBA" for a lot of these. need to split it up (keep the part before " dba ")
    dfp['employer_orig'] = dfp['AWARDEEORRECIPIENTLEGALENTITYNAMEANDDOINGBUSINESSAS'].apply(lambda x: str(x).lower().split(' dba ')[0])
    # loans take the name from the legal-entity-name column instead
    cond = (dfp['Deidltype'] == 'loan')
    dfp['employer_orig'] = np.where(cond,dfp['AWARDEEORRECIPIENTLEGALENTITYNAME'] , dfp['employer_orig'])
    #drop columns
    dcols = ['AWARDEEORRECIPIENTLEGALENTITYNAMEANDDOINGBUSINESSAS', 'AWARDEEORRECIPIENTLEGALENTITYNAME',
       'AWARDINGAGENCYCD', 'AWARDINGOFFICECD', 'AWARDINGSUBTIERAGENCYCD',
       'BUSINESSFUNDSINDICATOR',
       'CFDA_NUM', 'FUNDINGAGENCYCD', 'FUNDINGOFFICECD',
       'FUNDINGSUBTIERAGENCYCD', 'LEGALENTITYADDRLINE1', 'LEGALENTITYCITYNAME',
       'LEGALENTITYCONGRESSIONALDISTRICT', 'LEGALENTITYCOUNTRYCD',
       'LEGALENTITYSTATECD', 'LEGALENTITYZIP5', 'LEGALENTITYZIPLAST4',
       'PERIODOFPERFORMANCECURRENTENDDATE',
       'PRIMPLACEOFPERFORMANCECD',
       'PRIMPLACEOFPERFORMANCECONGRESSIONALDISTRICT',
       'PRIMPLACEOFPERFORMANCECOUNTRYCD', 'RECORDTYPE']
    dfp.drop(columns = dcols, inplace=True)
    #get address info
    geo = _load_geocoded_addresses(dataloc + 'AutoGeo_eidl.txt')
    #merge in (IDs stay as strings here)
    mc = ['ID']
    dfp = merging(dfp, geo, mc, mc, True, 'left')
    geo = None  # release the geocode table before the next large step
    dfp[['employer_edit','employer']] = dfp[['employer_orig', 'city', 'fipsstate']].apply(lambda x: employeredit(x), axis=1, result_type='expand')
    #final steps
    stateabbrev = ['CA', 'DE', 'NY','ND', 'VA','DC', 'FL', 'WV','MI', 'AL', 'NE', 'AK', 'NV', 'AZ', 'NH', 'AR', 'NJ', 'NM', 'CO', 'CT', 'NC',
                      'OH', 'OK', 'GA', 'OR', 'HI', 'PA', 'ID', 'IL', 'RI', 'IN', 'SC', 'IA', 'SD',
                   'KS', 'TN', 'KY', 'TX', 'LA', 'UT', 'ME', 'VT', 'MD', 'MA', 'WA', 'MN',  'MS', 'WI', 'MO',
                   'WY', 'MT']
    # write one store per state and award type
    for st in stateabbrev[:]:
        cond = (dfp['fipsstate'] == stateinfo(st, 'fips'))
        for suff in ['grant', 'loan']:
            cond2 = (dfp['Deidltype'] == suff)
            subset = dfp[cond & cond2]
            # the with-block closes the store even if writing fails
            with pd.HDFStore(dataloc + 'pppfiles/' + 'eidl'+suff+'_' + st + '.h5') as store:
                try:
                    store.put(st, subset, format='table', data_columns=True)
                except Exception:
                    # fall back to the default storage format when the
                    # table format cannot hold this frame
                    store.put(st, subset, data_columns=True)
    #save
    #savesplit(dfp, "ppporig")
    return ()


def ldbedits():
    '''
    Prepare the LDB establishment file for matching and write one file per commuting zone.

    Combines the 2019 and 2020 LDB extracts, keeps owner_code 5, attaches
    geocoded addresses (filling gaps from two fallback files), cleans the
    trade / legal names and writes pppfiles/ldb/cz/ldb_<cz>.psv.

    :return: None
    '''
    def most_common(values):
        # one scalar per group: pd.Series.mode returns an array on ties or
        # empty groups, which breaks the to_numeric fill below
        modes = values.mode()
        return modes.iloc[0] if len(modes) else np.nan

    # get ldb data
    #2019 only has state_comb, not fips
    #this COULD be right, but it could also be coming from address file
    #this is a problem for ui_acct
    #as a result, to avoid confusion, just dropping all of it
    #if i need it, i'll have to merge it in
    kcols = ['ldb_num',  'ein', 'naics_code',
       'trade_name', 'legal_name', 'aaemp', 'owner_code',
      'naics2dig1',
       'trade_edit', 'legal_edit',
        'employer_first', 'Number of Estab in EIN', ]
    dfldb = pd.concat([opensplitnew('ldb/' + str(yr) + 'ldbv1', kcols) for yr in [2019,2020]]).drop_duplicates().rename(columns = {'naics2dig1' : 'naics2dig'})
    # prefer the trade name, fall back to the legal name
    dfldb['employer'] = dfldb['trade_edit'].fillna( dfldb['legal_edit'])
    cond = (dfldb['owner_code'] == 5)
    dfldb = dfldb[cond]
    #employer name
    dfldb['ldb_num'] = pd.to_numeric(dfldb['ldb_num'], errors='coerce')
    ###only keep one ldb number
    dfldb.drop_duplicates('ldb_num', inplace=True)
    #get address info
    #address info for reading in
    dfaddr = ldbaddresses(2020,2020)
    #order of address value: ph, then ui, then ot (geocoded addresses only)
    cond1 = (dfaddr['addr_type'] == 'ph') & (dfaddr['Geocode Type'] != 'NotGeocoded')
    cond2 = (dfaddr['addr_type'] == 'ui') & (dfaddr['Geocode Type'] != 'NotGeocoded')
    cond3 = (dfaddr['addr_type'] == 'ot') & (dfaddr['Geocode Type'] != 'NotGeocoded')
    dfaddr['addr_order'] = np.where(cond1, 1, np.where(cond2, 2, np.where(cond3, 3, np.nan)))
    dfaddr['addr_order_group'] = dfaddr.groupby('ldb_num')['addr_order'].transform('min')
    #get rid of addresses not geocoded if LDB has at least one geocoded
    kcond = ~((dfaddr['addr_order_group'].notnull()) & (dfaddr['Geocode Type'] == 'NotGeocoded'))
    dfaddr = dfaddr[kcond]
    #merge in
    dfaddr['ldb_num'] = pd.to_numeric(dfaddr['ldb_num'], errors='coerce')
    addcols = ['ldb_num', 'addr_edit', 'fipscounty', 'city', 'st', 'zipcode5',
       'Latitude', 'Longitude', 'Geocode Type', 'addr_order']

    mc = ['ldb_num']
    dfldb = merging(dfldb, dfaddr[addcols],mc, mc,True,'left')
    dfaddr = None
    '''
    added on 8/25 - goal is to fill in city/county
    '''
    rdict = {'cnty': 'fipscounty'}
    dfaddr = pd.read_csv(dataloc + 'ldb/ldbaddr/ldbaddressesv1_' + str(2020) + '.psv', sep='|')[
        ['ldb_num', 'city', 'cnty', 'fipsstate']].rename(columns=rdict)
    # Same clean-up as the PPP/EIDL city column. The previous inline version
    # passed regex patterns ('city$|...', '[.]', '^st ') to str.replace, which
    # treats them as literal text, so no suffix, period or leading "st" was
    # ever normalised.
    dfaddr['city'] = dfaddr['city'].apply(cityedit)
    dfaddr['st'] = dfaddr['fipsstate'].apply(lambda x: stateinfo(x, 'abbr'))
    dfldb = merging(dfldb, dfaddr, mc, mc, True, 'left')
    dfaddr = None
    # keep the geocoded value (_x) and fall back to the address-file value (_y)
    for c in ['city', 'st', 'fipscounty']:
        dfldb[c] = dfldb[c + '_x'].fillna(dfldb[c + '_y'])
        dfldb.drop(columns=[c + '_x', c + '_y'], inplace=True)
    dfldb['fipsstate'] = dfldb['st'].apply(lambda x: stateinfo(x, 'fips'))
    # <name>_spaces keeps single spaces, <name> is overwritten with the no-space version
    for c in ['trade_edit', 'legal_edit']:
        dfldb[[c + '_spaces', c]] = dfldb[[c, 'city', 'fipsstate']].apply(lambda x: employeredit(x), axis=1,
                                                                          result_type='expand')
    ###the address file clearly messed up and doesn't have all respondents. this needs to be looked at.
    ### in the meantime, pulling in county / long-lat coordinates from dynamic files to round out those missing
    renamedict = {'fipsstate' : 'fipsstate_ldb', 'CNTY' : 'fipscounty_ldb', 'ldb_long' : 'Longitude_ldb', 'ldb_lat' : 'Latitude_ldb'}
    dft = pd.read_csv(dataloc + "ldbv3location_2019.zip").rename(columns = renamedict)
    mc = ['ldb_num']
    dfldb = merging(dfldb, dft, mc, mc, True, 'left')
    dfldb['fipsstate'] = dfldb['st'].apply(lambda x: stateinfo(x, 'fips'))
    for c in ['fipsstate', 'fipscounty', 'Latitude', 'Longitude'
              ]:
        dfldb[c] = dfldb[c].fillna(dfldb[c + '_ldb'])
    dfldb = dfldb.drop_duplicates(['ldb_num','addr_edit','fipsstate','fipscounty','city']).drop(columns = ['Unnamed: 0', 'fipsstate_ldb', 'fipscounty_ldb', 'Longitude_ldb', 'Latitude_ldb'])
    #final steps
    dfldb['st'] = dfldb['fipsstate'].apply(lambda x: stateinfo(x, 'abbr'))
    dfg = pd.read_excel(dataloc + 'geography/' + 'commuting_zones.xls')
    # FIPS is state code followed by a 3-digit county code
    dfg['fipscounty'] = dfg['FIPS'].apply(lambda x: float(str(x)[-3:]))
    dfg['fipsstate'] = dfg['FIPS'].apply(lambda x: float(str(x)[:-3]))
    rdict = {
        'Commuting Zone ID, 2000' : 'cz',
    }
    mc = ['fipsstate', 'fipscounty']
    kcols = mc + ['cz']
    dfg = dfg.rename(columns = rdict)[kcols].drop_duplicates(subset = mc)
    dfldb = dfldb.merge(dfg[kcols], on=mc, how='left', indicator=True)
    # fill a missing cz with the most common cz of the state
    state_cz = dict(dfldb.groupby('fipsstate')['cz'].agg(most_common))
    dfldb['st_cz'] = dfldb['fipsstate'].map(state_cz)
    dfldb['cz'] = pd.to_numeric(dfldb['cz'].fillna(dfldb['st_cz']))
    czlist = [i for i in dfldb['cz'].unique() if str(i)!='nan']
    kcols = ['ldb_num', 'ein', 'naics_code', 'trade_name', 'legal_name', 'aaemp',
       'owner_code', 'naics2dig', 'trade_edit', 'legal_edit', 'employer_first',
       'Number of Estab in EIN', 'employer', 'addr_edit', 'fipscounty', 'city',
       'st', 'zipcode5', 'Latitude', 'Longitude', 'Geocode Type', 'addr_order',
       'fipsstate', 'trade_edit_spaces', 'legal_edit_spaces', 'cz',
       ]
    dfldb = dfldb[kcols]
    # one output file per commuting zone
    for cz in czlist:
        cond = (dfldb['cz'] == cz)
        cz = str(int(cz))
        dfldb[cond].to_csv(dataloc + 'pppfiles/ldb/cz/' + 'ldb_' + cz + '.psv', sep='|')

    return


# Column order expected by employerfuzzy(): for each match group (a, b, c) the
# candidate name, its cosine score and its industry flag, then the employer name.
fuzzycols = ['ldb_employer_fuzzy_a', 'ldb_cosine_fuzzy_a', 'Dindustry_fuzzy_a', 'ldb_employer_fuzzy_b',
                 'ldb_cosine_fuzzy_b', 'Dindustry_fuzzy_b','ldb_employer_fuzzy_c',
                 'ldb_cosine_fuzzy_c', 'Dindustry_fuzzy_c', 'employer']

def employerfuzzy(x):
    '''
    Pick the best candidate match for one employer from the three match groups.

    :param x: row-like value laid out as in fuzzycols (a pandas row or any
              sequence; values are read by position). A score of 1 marks an
              exact match and -1 a first-word match.
    :return: tuple (name, group, fuzztype, tempratio, dindustry); name and
             group are '' and fuzztype is 'No Match' when nothing qualifies
    '''
    # list() reads by position without integer-indexing a label-indexed row
    vals = list(x)
    employer = str(vals[-1])
    name = ''
    group = ''
    dindustry = ''
    fuzztype='No Match'
    fuzzycutoff=50
    tempratio=0
    #9 because there are 3 groups (x 3 variables); these are the score positions
    numberlist = [n for n in range(9) if n % 3 == 1]

    def is_partial(score):
        # scores may be '' or missing for groups without a candidate
        try:
            return bool((score < 1) & (score > 0))
        except TypeError:
            return False

    #if there's an exact match in any group, keep the first one
    for i in numberlist:
        if vals[i] == 1:
            name = vals[i - 1]
            group = fuzzycols[i][-1]
            fuzztype = 'Exact Match'
            dindustry = vals[i + 1]
            break
    #if there's a fuzzy match in the first two groups, then use it
    if name=='':
        for i in numberlist[:2]:
            if is_partial(vals[i]) and name=='':
                name = vals[i - 1]
                fuzztype = 'Fuzzy Match'
    #cycle through remaining candidates and choose the one with highest fuzzywuzzy match
    if name=='':
        choices = []
        for i in numberlist:
            candidate = str(vals[i - 1])
            #if employer name is a substring of the candidate, keep it
            if employer in candidate:
                if name=='':
                    name = candidate
            elif candidate != '':
                # empty candidates can never reach the cutoff; skip them
                choices.append(candidate)
        if name=='' and choices:
            #fuzzy wuzzy match, choose best match if it makes fuzzycutoff
            results = process.extract(employer, list(set(choices)), scorer=fuzz.token_sort_ratio)
            #don't want the 100s if there is an alternative
            results2 = [r for r in results if r[1]!=100]
            if results2:
                if results2[0][1]>fuzzycutoff:
                    name = results2[0][0]
                    # report the score of the candidate actually chosen
                    tempratio = results2[0][1]
            #otherwise, use the first one that gives 100 match
            elif results[0][1]>fuzzycutoff:
                name = results[0][0]
                tempratio = results[0][1]
    # locate the group the chosen name came from; skipped when nothing was
    # chosen so that empty candidate cells are not reported as a match
    if name != '':
        for i in numberlist:
            if (vals[i-1]==name) & (group==''):
                group = fuzzycols[i][-1]
                dindustry = vals[i + 1]
                if (vals[i]==-1):
                    fuzztype = 'First Word Match'
                else:
                    fuzztype = 'Fuzzy Match'
    return name,group,fuzztype,tempratio,dindustry



def stateloopfull(dfp,dfldb,cz):
    '''
    Fuzzy-match the employers in dfp against dfldb for one commuting zone.

    'employer' is the name of establishment
    'city' is the name of the city in the address
    'fipsstate' is the FIPS state 2-digit number
    fipscounty is the 3-digit FIPS county number

    The matching itself is done by cossimloop4ppp (cosine similarity on name
    n-grams, see https://bergvca.github.io/2017/10/14/super-fast-string-matching.html).
    Employers without a match are kept in the result with fuzzy_group 'No Match'.

    :param dfp: employers to match; fipsstate / fipscounty are coerced in place
                and the merge_fuzzy / *_fuzzy helper columns are added
    :param dfldb: LDB candidates; fipsstate / fipscounty are coerced in place
    :param cz: commuting zone being processed (not used inside the function)
    :return: DataFrame of match results (empty, with the key columns and
             fuzzy_group, when dfp has no rows)
    '''
    bcols = ['employer', 'city', 'fipscounty','cz', 'fipsstate']
    # Nothing to match: previously this path reached the return statement
    # with the result never assigned and raised UnboundLocalError.
    if len(dfp) == 0:
        return pd.DataFrame(columns=bcols + ['fuzzy_group'])
    '''
    just ensuring both merging variables are numeric and consistent
    '''
    for c in ['fipsstate', 'fipscounty']:
        dfp.loc[:, c] = dfp[c].apply(getint)
        dfldb.loc[:, c] = dfldb[c].apply(getint)
    '''
    6/3/22
    creating values for first word, bucketed by how common the first word is
    among distinct employers
    tlist0 gets no value for first word match
    tlist1 gets .05
    tlist2 gets .10
    '''
    tdict = dfp.drop_duplicates(subset='employer')['employer_first'].value_counts()
    share = tdict / len(tdict)
    tlist0 = tdict[share > .001].keys()
    tlist1 = tdict[(share > .0002) & (share <= .001)].keys()
    tlist2 = tdict[(share <= .0002)].keys()
    tdict = None
    firstwordvals = [tlist0,tlist1,tlist2]
    '''
    this vectorizer is used to turn ngrams for each employer name
    into sparse vector; the ngram analyzer is the project helper `ngrams`
    '''
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
    '''
    starting point for key variables

    merge_fuzzy is an indicator for removing an observation if we find a sufficient match
    ldb_employer_fuzzy gets replaced with the employer name that is a sufficient match
    ldb_cosine_fuzzy gets replaced with the cosine similarity value (0-1) for a sufficient match.
    If there is an exact match, this is filled in with a 1
    '''
    dfp.loc[:, 'merge_fuzzy'] = 'No'
    dfp.loc[:, 'ldb_employer_fuzzy'] = ''
    dfp.loc[:, 'ldb_cosine_fuzzy'] = ''
    dfp.loc[:, 'Dindustry_fuzzy'] = ''
    '''
    4/15/22
    matching is done in a single pass over the whole input
    (no separate city / county / state cycles)
    '''
    dfbgt2 = cossimloop4ppp(dfbg=dfp.copy(), dfldb=dfldb.copy(), vectorizer=vectorizer,
                         firstwordvals=firstwordvals)
    #merge matched back to the employer list so unmatched employers are kept too
    dfbgt2 = dfbgt2.merge(dfp[bcols].drop_duplicates(subset=bcols), on=bcols,
                          how='outer')
    dfbgt2['fuzzy_group'] = dfbgt2['fuzzy_group'].fillna('No Match')
    return(dfbgt2)



def paringdown(dft, dfp, dfldb):
    '''
    Remove records that have already been matched from both inputs.

    :param dft: matched records (needs ID, ldb_num and ein)
    :param dfp: PPP records; rows whose ID is in dft are removed
    :param dfldb: LDB records; a row is removed when its ldb_num was matched
                  and it is NAICS 72 or a franchise, or when its ein was
                  matched and it is neither
    :return: tuple (remaining dfp, remaining dfldb)
    '''
    matched_ids = dft['ID'].unique().tolist()
    matched_ldb = dft['ldb_num'].unique().tolist()
    matched_ein = dft['ein'].unique().tolist()

    remaining_ppp = dfp[~dfp['ID'].isin(matched_ids)]

    # NAICS 72 / franchises are removed by establishment number ...
    by_ldb_num = dfldb['ldb_num'].isin(matched_ldb) & (
        (dfldb['naics2dig'] == '72') | (dfldb['Dfranchise'] == 1))
    # ... everything else by employer identification number
    by_ein = (dfldb['ein'].isin(matched_ein)
              & (dfldb['naics2dig'] != '72')
              & (dfldb['Dfranchise'] == 0))
    remaining_ldb = dfldb[~by_ldb_num & ~by_ein]
    return (remaining_ppp, remaining_ldb)


def paringdown_eidl(dft, dfp, dfldb):
    '''
    EIDL version of paringdown: remove already-matched records from both inputs.

    :param dft: matched records (needs ID, ldb_num and ein)
    :param dfp: EIDL records; rows whose ID is in dft are removed
    :param dfldb: LDB records; a row is removed when its ldb_num was matched,
                  or when its ein was matched and it is not a franchise
    :return: tuple (remaining dfp, remaining dfldb)
    '''
    matched_ids = dft['ID'].unique().tolist()
    matched_ldb = dft['ldb_num'].unique().tolist()
    matched_ein = dft['ein'].unique().tolist()

    remaining_eidl = dfp[~dfp['ID'].isin(matched_ids)]

    by_ldb_num = dfldb['ldb_num'].isin(matched_ldb)
    by_ein = dfldb['ein'].isin(matched_ein) & (dfldb['Dfranchise'] == 0)
    remaining_ldb = dfldb[~by_ldb_num & ~by_ein]
    return (remaining_eidl, remaining_ldb)


def intersection(lst1, lst2):
    '''
    Elements of lst1 that also occur in lst2, in lst1 order (duplicates in lst1 are kept).
    '''
    shared = []
    for item in lst1:
        if item in lst2:
            shared.append(item)
    return shared


def exactaddress(dfp, dfldb, pppkeepcols):
    '''
    Match PPP records to LDB records on an identical name, city, state and street address.

    The PPP employer name is compared first with the LDB trade name and then,
    for whatever is still unmatched, with the LDB legal name. Matched records
    are removed from both sides (see paringdown) before the next pass.

    :param dfp: PPP records; not modified (renames are done on copies)
    :param dfldb: LDB records
    :param pppkeepcols: PPP columns to carry into the match output
    :return: tuple (list with one match DataFrame per pass, unmatched dfp, remaining dfldb)
    '''
    ldbcols = ['Number of Estab in EIN', 'aaemp',
           'ein',
           #'first_nonzero_emp', 'init_liability', 'payperworker', 'prevaaemp','prevtot_wages_yr', 'run_num',  'tot_wages_yr','ui_acct',
           'naics_code', 'owner_code',
           ]
    renamedict = {
        'naics2dig' : 'naics2dig_ldb'
    }
    df = []
    for v in ['trade_edit', 'legal_edit']:
        mc = [v, 'city', 'fipsstate', 'addr_edit']
        # Rename on a copy: an in-place rename would leave the caller's frame
        # with its 'employer' column renamed after the first pass.
        ppp_side = dfp.rename(columns={'employer' : v })
        dft = merging(dfldb[ldbcols + mc + ['ldb_num']].rename(columns= renamedict), ppp_side[pppkeepcols + mc], mc, mc, True, 'inner')
        dft['employer_fuzzy'] = dft[v].copy()
        dft['ldb_cosine_fuzzy'] = 1
        dft['Dindustry_fuzzy'] = ''
        dft['fuzzy_type'] = 'Exact Address Match'
        dft['fuzzy_group'] = 'a'
        df.append(dft.copy())
        ###keep ppp if not matched
        ##for now, not deduping LDB since it could in theory match to more than one PPP
        ppp_side, dfldb = paringdown(dft, ppp_side, dfldb.copy())
        dfp = ppp_side.rename(columns={ v : 'employer'})
    return(df, dfp, dfldb)

def addressmatch(df, dfp, dfldb):
    """Match loans to LDB rows at the same address, confirmed by name.

    Loans (``dfp``) and LDB rows (``dfldb``) are joined with ``merging`` on
    ``addr_edit``, ``city`` and ``fipsstate``.  Two passes then pick matches
    out of the joined pairs:

    1. pairs whose best ``fuzz.partial_ratio`` between the loan ``employer``
       and the LDB legal/trade name is above 69;
    2. pairs where ``employer_edit`` shares at least one space-separated word
       with the LDB legal or trade name.

    Within a pass the highest-``fuzzratio`` pair is kept per ``ID`` and then
    per ``ldb_num``.  Each pass's picks are labelled 'Address Match', appended
    to ``df`` and passed to ``paringdown`` along with the loan and LDB frames.

    Parameters:
        df: list of match frames; extended in place with one frame per pass.
        dfp: loan frame (needs ``employer``, ``employer_edit``, ``ID`` and the
            join keys).
        dfldb: LDB frame.

    Returns:
        ``(df, dfp, dfldb)`` with the pared-down loan and LDB frames.
    """
    ldbcols = ['Number of Estab in EIN', 'aaemp',
               'ein',
               'naics_code', 'owner_code',
               ]
    mc = ['addr_edit', 'city', 'fipsstate']
    name_cols = ['legal_edit', 'legal_edit_spaces', 'trade_edit', 'trade_edit_spaces']
    score_cols = ['fuzzratio_legal', 'fuzzratio_trade', 'fuzzratio']
    overlap_cols = ['Dwordoverlap_trade', 'Dwordoverlap_legal']

    # dict.fromkeys de-duplicates like set() but keeps a reproducible column
    # order (set ordering of strings changes between interpreter runs)
    ldb_fields = list(dict.fromkeys(ldbcols + mc)) + name_cols + ['ldb_num']
    dft = merging(dfp, dfldb[ldb_fields], mc, mc, True, 'inner')

    def _name_score(ldb_col):
        # partial_ratio(loan name, LDB name); blanked where the LDB name is missing
        scores = [fuzz.partial_ratio(str(loan_name), str(ldb_name))
                  for ldb_name, loan_name in zip(dft[ldb_col], dft['employer'])]
        return np.where(dft[ldb_col].isnull(), np.nan, scores)

    def _shares_word(ldb_col):
        # 1 when the edited loan name and the LDB name have any word in common
        # NOTE(review): str() turns missing names into 'nan', so two missing
        # names count as an overlap - confirm this is acceptable
        return [1 if intersection(str(loan_name).split(' '), str(ldb_name).split(' ')) else 0
                for loan_name, ldb_name in zip(dft['employer_edit'], dft[ldb_col])]

    def _best_pairs(rows):
        # best-scoring LDB row per loan, then best-scoring loan per LDB row
        per_loan = rows.sort_values(['ID', 'fuzzratio']).drop_duplicates('ID', keep='last')
        return per_loan.sort_values(['ldb_num', 'fuzzratio']).drop_duplicates('ldb_num', keep='last')

    def _label(picks, use_legal, helper_cols):
        picks['employer_fuzzy'] = np.where(use_legal, picks['legal_edit_spaces'],
                                           picks['trade_edit_spaces'])
        picks['ldb_cosine_fuzzy'] = .99
        picks['Dindustry_fuzzy'] = ''
        picks['fuzzy_type'] = 'Address Match'
        picks['fuzzy_group'] = 'a'
        return picks.drop(columns=name_cols + helper_cols)

    dft['fuzzratio_legal'] = _name_score('legal_edit')
    dft['fuzzratio_trade'] = _name_score('trade_edit')
    dft['fuzzratio'] = dft[['fuzzratio_trade', 'fuzzratio_legal']].max(axis=1)

    # pass 1: similar names at the same address
    dft1 = _best_pairs(dft[dft['fuzzratio'] > 69])
    dft1 = _label(dft1, dft1['fuzzratio_legal'] == dft1['fuzzratio'], score_cols)
    df.append(dft1)
    # remove from dfs
    dfp, dfldb = paringdown(dft1, dfp, dfldb.copy())

    # pass 2: looking for word overlaps
    # NOTE(review): this pass still draws from the full joined frame, so pairs
    # picked in pass 1 can be picked again here - later de-duplication handles it
    dft['Dwordoverlap_trade'] = _shares_word('trade_edit_spaces')
    dft['Dwordoverlap_legal'] = _shares_word('legal_edit_spaces')
    cond = ((dft['Dwordoverlap_trade'] == 1) | (dft['Dwordoverlap_legal'] == 1))
    dft1 = _best_pairs(dft[cond])
    dft1 = _label(dft1, dft1['Dwordoverlap_legal'] == 1, score_cols + overlap_cols)
    df.append(dft1)
    # remove from dfs
    dfp, dfldb = paringdown(dft1, dfp, dfldb.copy())
    return (df, dfp, dfldb)

def ldbfuzzymerge(dft, dfldb):
    """Attach LDB rows to fuzzy name matches, one geography level per group.

    ``dft`` carries an ``employer_fuzzy`` name and a ``fuzzy_group`` letter.
    Each group is joined (via ``merging``) to the LDB on that name plus a
    geography key: group 'a' on city and state, 'b' on county and state,
    'c' on state only.  Every group is joined twice, once with the LDB trade
    name standing in as ``employer_fuzzy`` and once with the legal name.

    The LDB side is reduced to ``ldb_num``, ``lat_ldb``, ``long_ldb``,
    ``naics2dig_ldb`` and the join keys, de-duplicated on the keys.  Only the
    first join renames ``naics2dig`` itself, so ``dfldb`` needs to carry
    ``naics2dig_ldb`` already for the remaining joins.

    Returns the six join results stacked into one frame.
    """
    ldb_fields = ['ldb_num', 'lat_ldb', "long_ldb", 'naics2dig_ldb']

    # column renames that present one LDB name column as 'employer_fuzzy'
    first_trade = {'trade_edit': 'employer_fuzzy', 'naics2dig': 'naics2dig_ldb'}
    as_trade = {'employer_fuzzy': 'legal_edit', 'trade_edit': 'employer_fuzzy'}
    as_legal = {'employer_fuzzy': 'trade_edit', 'legal_edit': 'employer_fuzzy'}

    city_level = ['city', 'fipsstate']
    county_level = ['fipscounty', 'fipsstate']
    state_level = ['fipsstate']

    # (fuzzy_group, geography keys, LDB rename) in the order results are stacked
    plan = [
        ('a', city_level, first_trade),
        ('a', city_level, as_legal),
        ('b', county_level, as_trade),
        ('b', county_level, as_legal),
        ('c', state_level, as_trade),
        ('c', state_level, as_legal),
    ]

    pieces = []
    for group, geography, renames in plan:
        keys = ['employer_fuzzy'] + geography
        candidates = dfldb.rename(columns=renames)[ldb_fields + keys].drop_duplicates(keys)
        group_rows = dft[dft['fuzzy_group'] == group]
        pieces.append(merging(candidates, group_rows, keys, keys, True, 'inner'))
    return pd.concat(pieces, ignore_index=True)


def bestfuzzymerge(dft1, dfldb):
    '''
    keep best 1 match per ppp

    1) keep if sector match
    2) calculate distance
    3) choose closest

    Loans (``ID``) with a single candidate are kept as they are.  For loans
    with several candidates, rows with a non-empty ``Dindustry_fuzzy`` are
    preferred whenever the loan has at least one such row; the remaining
    candidates are ranked by haversine distance (km) between the loan
    coordinates (``Longitude``/``Latitude``) and the LDB coordinates
    (``long_ldb``/``lat_ldb``) and the closest one is kept.

    The survivors are then joined back to ``dfldb`` on ``ldb_num`` (via
    ``merging``) to pick up the LDB detail columns.

    Side effect: a ``num_ppp_match`` column is added to the frame passed in.

    Returns the merged frame with one row per loan.
    '''
    from math import radians, cos, sin, asin, sqrt

    dft1['num_ppp_match'] = dft1.groupby('ID')['employer'].transform('count')
    cond = (dft1['num_ppp_match'] == 1)
    # explicit copies: new columns are added to both halves below
    dft1a = dft1[cond].copy()
    dft1 = dft1[~cond].copy()
    cond = (dft1['Dindustry_fuzzy'].notnull()) & (dft1['Dindustry_fuzzy'] != '')
    dft1['D_industry_match'] = np.where(cond, 1, 0)
    dft1['temp_industry'] = dft1.groupby('ID')['D_industry_match'].transform('max')
    # if a loan has any industry match keep only those rows, otherwise keep all
    kcond = (((dft1['temp_industry'] > 0) & (dft1['D_industry_match'] == 1)) | (dft1['temp_industry'] == 0))
    dft1 = dft1[kcond].copy()

    def haversine(lon1, lat1, lon2, lat2):
        # great-circle distance in km between two points given in degrees
        lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        c = 2 * asin(sqrt(a))
        km = 6367 * c
        return km

    def distancecalc(x):
        # a falsy LDB longitude (0 / None / '') is treated as "no coordinates"
        if not x['long_ldb']:
            return np.nan
        return haversine(x['Longitude'], x['Latitude'], x['long_ldb'], x['lat_ldb'])

    try:
        dft1['distance_km'] = dft1.apply(distancecalc, axis=1)
        dft1a['distance_km'] = dft1a.apply(distancecalc, axis=1)
    except Exception:
        # best effort: if distances cannot be computed for either frame
        # (e.g. missing or non-numeric coordinates) fall back to no distance
        # for both.  np.nan is used because the np.NaN alias no longer exists
        # in NumPy 2.
        dft1['distance_km'] = np.nan
        dft1a['distance_km'] = np.nan

    dft1 = dft1.sort_values(['ID', 'distance_km']).drop_duplicates('ID', keep='first')

    dft1 = pd.concat([dft1a, dft1], ignore_index=True)
    ldbcols = ['Number of Estab in EIN', 'aaemp',
               'ein',
               'naics_code', 'owner_code',
               ]
    mc = ['ldb_num']
    dft1 = merging(dft1, dfldb[ldbcols + mc], mc, mc, True, 'inner')
    return (dft1)

def dedup_ldb(df):
    '''
    Resolve LDB rows that matched more than one loan.

    Match types are ranked (exact address first, fuzzy last).  For every
    ``ldb_num`` only the rows with that unit's best rank are kept; afterwards
    one row is kept per (``ID``, ``ein``) pair, again preferring the best rank.

    Side effect: adds ``rankfuzz`` and ``ldb_rankfuzz`` columns to ``df``.
    '''
    priority = {
        'Exact Address Match': 1,
        'Address Match': 2,
        'Exact Match': 3,
        'Fuzzy Match': 4,
    }
    df['rankfuzz'] = df['fuzzy_type'].map(priority)
    # best (lowest) rank any loan achieved for each LDB unit
    df['ldb_rankfuzz'] = df.groupby('ldb_num')['rankfuzz'].transform('min')
    at_best_rank = df['rankfuzz'] == df['ldb_rankfuzz']
    ordered = df[at_best_rank].sort_values(['ID', 'ein', 'rankfuzz'])
    return ordered.drop_duplicates(['ID', 'ein'], keep='first')



def dedup_ldb_eidl(df):
    '''
    Resolve LDB rows that matched more than one record, using distance only.

    One row is kept per ``ldb_num`` (the smallest ``distance_km``).  Within
    each ``ein`` rows farther away than that EIN's minimum distance are
    dropped, and finally only one row is kept per (``ein``, ``distance_km``).
    '''
    closest = df.sort_values(['ldb_num', 'distance_km']).drop_duplicates('ldb_num', keep='first')
    closest['ein_dist'] = closest.groupby('ein')['distance_km'].transform('min')
    # if distance is higher than the EIN's minimum distance, get rid of it
    too_far = closest['distance_km'] > closest['ein_dist']
    kept = closest[~too_far]
    # last check - only one row of a particular distance per ein
    return kept.drop_duplicates(['ein', 'distance_km'])

def oos_ppp_match(dfp, dfldb, pppkeepcols):
    """Match a set of loans using the address-based steps only.

    Runs ``exactaddress`` and then ``addressmatch``, stacks their matches,
    sets ``distance_km`` to 0 on all of them and de-duplicates with
    ``dedup_ldb``.  Loans left without a match are appended with
    ``fuzzy_type`` set to 'No Match'.

    Returns ``(result_frame, dfldb)`` where ``dfldb`` is the LDB frame as
    returned by the matching steps.
    """
    # exact address matching
    match_frames, dfp, dfldb = exactaddress(dfp, dfldb, pppkeepcols)
    # address matching
    match_frames, dfp, dfldb = addressmatch(match_frames, dfp.drop(columns=['_merge']), dfldb)

    matched = pd.concat(match_frames, ignore_index=True)
    matched['distance_km'] = 0
    # deduplicate LDB
    matched = dedup_ldb(matched)

    # get no matches
    matched_ids = matched['ID'].unique().tolist()
    unmatched = dfp[~dfp['ID'].isin(matched_ids)]
    unmatched['fuzzy_type'] = 'No Match'
    return (pd.concat([matched, unmatched], ignore_index=True), dfldb)


def mainfile_eidl(st):
    """Match the EIDL grant records of one state file to the LDB.

    ``st`` is the suffix used to build the input names ('eidlgrant_' + st and
    'ldb_' + st).  The steps are: load and filter both sides, exact address
    match, address match, fuzzy name match on what is left, pick the best
    fuzzy candidate per record, de-duplicate LDB rows, and append the
    records that found no match with ``fuzzy_type`` = 'No Match'.

    Returns one DataFrame with matched rows followed by unmatched records.

    NOTE(review): ``logging.exception`` is used throughout for progress
    messages outside any ``except`` block - presumably so they show up at the
    default log level; confirm before changing.
    """
    # NOTE(review): dataloc1 is not used anywhere in this function
    dataloc1 = "/dataERS/eract/daltonm/pppfiles/"
    logging.exception('*******************')
    logging.exception(st)
    logging.exception('*******************')
    #open the EIDL file (grant file; the loan file name is kept for reference)
    #filename = 'eidlloan_' + st
    filename = 'eidlgrant_' + st
    dfp = opensplitnew('pppfiles/' + filename, [])
    #keep only business types 'R' and 'RM'
    #i believe this drops sole proprietors
    # the "P" group was getting very low match rate
    kcond = (dfp['BUSINESSTYPES'].isin(['R', 'RM']))
    dfp = dfp[kcond]
    #this list is for the fuzzy matches
    dftlist = []
    #this list is for the address matches
    dflistmain = []
    filename = 'ldb_' + st
    #dfldb = opensplitnew('pppfiles/' + filename, [])
    #LDB coordinates are renamed so they do not clash with the loan-side Latitude/Longitude
    dfldb = pd.read_csv(dataloc + 'pppfiles/' + filename + '.csv', sep='|').rename(columns = {'Latitude' : "lat_ldb",
                                                                                              'Longitude' : 'long_ldb'})
    #keep ldb if private (owner_code 5)
    cond = (dfldb['owner_code'] == 5)
    dfldb = dfldb[cond]
    #get franchise info: flag LDB rows whose ein appears in the merged SBA franchise file
    einlist = pd.read_csv(dataloc + 'sba_franchise_fullymerged.csv')['ein'].unique().tolist()
    cond = (dfldb['ein'].isin(einlist))
    dfldb['Dfranchise'] = np.where(cond, 1, 0)
    #clear
    # NOTE(review): dflist is not used afterwards
    dflist = []
    #exact address matching
    #loan-side columns carried through the exact address match
    pppkeepcols = ['ACTIONDATE', 'ACTIONTYPE', 'ASSISTANCETYPE', 'AWARDDESC',
                   'AWARDMODIFICATIONAMENDMENTNUM', 'BUSINESSTYPES',
                   'FACEVALUEOFDIRECTLOANORLOANGUARANTEE', 'FAIN',
                   'FEDERALACTIONOBLIGATION', 'NONFEDERALFUNDINGAMOUNT',
                   'ORIGINALLOANSUBSIDYCOST', 'PERIODOFPERFORMANCESTARTDATE',
                   'SAI_NUM', 'employer_orig', 'Name of File', 'ID',
                   'Latitude', 'Longitude', 'Dzip_problem', ]
    df, dfp, dfldb = exactaddress(dfp, dfldb, pppkeepcols)
    #address matching
    df, dfp, dfldb = addressmatch(df, dfp, dfldb)
    logging.exception(st + ' exact matching done, now doing fuzzy matching')

    ####fuzzy matching on remaining
    ldbcols = ['trade_edit', 'legal_edit', 'city', 'fipsstate', 'fipscounty', ]

    pcols = ['employer', 'city', 'fipsstate', 'fipscounty', ]

    #only the distinct name/geography combinations are sent to the fuzzy matcher
    dft = stateloopfull(dfp[pcols].drop_duplicates(pcols), dfldb[ldbcols].drop_duplicates(ldbcols))
    logging.exception(st + ' fuzzy matching done, now doing final steps')
    #merge the full loan records back onto the fuzzy match results
    mc = ['city', 'employer', 'fipscounty', 'fipsstate']
    dft = merging(dft.drop_duplicates(mc), dfp, mc, mc, True, 'left')

    # '''
    # testing
    # '''
    # print(dft['fuzzy_type'].value_counts())
    # print(dft.groupby('fuzzy_type')['BusTypDesc'].e_counts() / dft.groupby('fuzzy_type')['BusTypDesc'].count())
    # #print(df[0]['BusTypDesc'].value_counts() / len(df[0]))
    # #print(df[2]['BusTypDesc'].value_counts() / len(df[2]))
    # print(dft.groupby('fuzzy_type')['CURREMP'].sum())
    # #print(df['CURREMP'].sum())

    #merge in LDB info
    #ldbfuzzymerge selects 'naics2dig_ldb', so rename on the LDB frame first
    renamedict = {
        'naics2dig' : 'naics2dig_ldb'
    }
    dfldb.rename(columns = renamedict, inplace=True)
    dft = ldbfuzzymerge(dft, dfldb)
    dftlist.append(bestfuzzymerge(dft, dfldb))
    dflistmain.append(pd.concat(df, ignore_index=True))
    #address matches first, then fuzzy matches
    df = pd.concat(dflistmain + dftlist, ignore_index=True)

    #deduplicate LDB
    # NOTE(review): address matches carry no distance_km at this point, so they
    # enter the distance-based de-duplication with missing distance - confirm intended
    df = dedup_ldb_eidl(df)

    #get no matches: whatever is left in dfp and did not end up in df
    dlist = df['ID'].unique().tolist()
    cond = (dfp['ID'].isin(dlist))
    dfp = dfp[~cond]
    dfp['fuzzy_type'] = 'No Match'

    # '''
    # testing
    # '''
    # df['test'] = df['CURREMP']  / df['aaemp']
    # df['test2'] = df['CURREMP']  - df['aaemp']
    # df['test'] = df['test'].replace([np.inf, -np.inf], np.NaN)
    # print(df.groupby(['fuzzy_type','fuzzy_group'])['test2'].describe())
    #
    # cond = (df['test2'] > 100)
    # samp(df[cond])
    #
    # print(df['CURREMP'].sum())
    # print(dfp['CURREMP'].sum())
    #
    # print(df['INITLAPPVAMT'].sum())
    # print(dfp['INITLAPPVAMT'].sum())

    df = pd.concat([df,dfp], ignore_index=True)
    return(df)

def sbafuzzymerge():
    """Link SBA franchise brand names to LDB establishments and save the result.

    Steps, in order:
      1. load the SBA franchise list, clean the brand names and split entries
         that bundle several names with '/';
      2. load the 2019 LDB names and find exact (space-stripped) name matches;
      3. fuzzy-match the remaining franchise names against LDB names that
         belong to multi-unit EINs or are shared by several rows;
      4. keep the best match per franchise identifier code and join it back
         to the LDB rows;
      5. write the result with ``savesplit`` and as
         ``sba_franchise_fullymerged.csv`` under ``dataloc``.

    Takes no arguments and returns None; the output is the saved files
    (an intermediate 'sba_franchise_fuzzy_pt1' file is also written and
    read back).
    """
    dffranch = opensplitnew('sbafranchise', [])
    #placeholder geography so the row can be passed through employeredit, which is given city and state
    dffranch['city'] = 'no city'
    dffranch['fipsstate'] = 999
    #d/b/a and f/k/a are different ways to say "alternative" names. this is only like 1% of franchises, but still useful to do
    dffranch['brand_edit'] = dffranch['BRAND:'].str.replace('d/b/a', '/').str.replace('f/k/a', '/').str.replace(' dba ', '/').str.replace(' fka ', '/')
    dffranch[['employer_edit','employer']] = dffranch[['brand_edit', 'city', 'fipsstate']].apply(lambda x: employeredit(x), axis=1, result_type='expand')
    dffranch['employer'] = dffranch['employer'].str.replace(" ", "")
    #drop rows whose cleaned name is empty or whose brand is missing
    kcond = (dffranch['employer'] != '') & (dffranch['brand_edit'].notnull())
    dffranch = dffranch[kcond]
    #they sometimes lump together multiple names in here, split by '/'. The problem is some legit names include '/' so can't split this easily
    dft = dffranch['brand_edit'].str.split('/', expand=True)
    dft.columns = ['emp_' + str(i) for i in dft]
    ncols = len(dft.columns)
    #flag cases where one of the pieces has at most 2 characters (so the '/' is probably part of the name),
    #or where there is no second piece at all; flagged rows are not split
    for c in dft:
        cond = (((dft[c].apply(lambda x: len(str(x))) <= 2) & (dft[c].apply(lambda x: len(str(x))) > 0) & (dft['emp_1'].notnull())) | (dft['emp_1'].isnull()))
        dft['D'+c] = np.where(cond, 1, 0)

    #Dremove > 0 means at least one piece was flagged above
    dft['Dremove'] = dft[[c for c in dft if c.startswith('Demp')]].sum(axis=1)

    #blank out the split pieces on flagged rows
    cond = (dft['Dremove'] > 0)
    for i in range(0,ncols):
        c = 'emp_' + str(i)
        dft[c] = np.where(cond, None, dft[c])

    dffranch = pd.concat([dffranch, dft], axis=1, ignore_index=False)
    # NOTE(review): franchcols is not referenced again in this function
    franchcols = ['SBA FRANCHISE IDENTIFIER CODE:', 'BRAND:', 'MEETS FTC DEFINITION?:',
           'IS AN  ADDENDUM NEEDED?:', 'SBA ADDENDUM - Form 2462:',
           'SBA NEGOTIATED ADDENDUM:', 'SBA FRANCHISE IDENTIFIER CODE Start Date:',
           'NOTES:', 'city', 'fipsstate', 'brand_edit', 'employer_edit',
           'employer', 'emp_0', 'emp_1', 'emp_2', 'emp_3', 'emp_4', 'emp_5',
           'emp_6', 'emp_7', 'Demp_0', 'Demp_1', 'Demp_2', 'Demp_3', 'Demp_4',
           'Demp_5', 'Demp_6', 'Demp_7', 'Dremove']

    #get edited version of each split piece (same cleaning as the full brand name)
    for i in range(0,ncols):
        c = 'emp_' + str(i)
        dffranch[['employer_edit_' + str(i), 'employer_' + str(i)]] = dffranch[[c, 'city', 'fipsstate']].apply(
            lambda x: employeredit(x), axis=1, result_type='expand')
        dffranch['employer_' + str(i)] = dffranch['employer_' + str(i)].str.replace(" ", "")

    dflist = []
    #concat other columns as rows: every usable split piece becomes its own row with the piece as 'employer'
    for i in range(0, ncols):
        c = 'employer_' + str(i)
        cond = (dffranch[c].notnull()) & (dffranch[c] != 'none')
        dft = dffranch[cond]
        dft['employer'] = dft[c]
        dft['employer_edit'] = dft['employer_edit_' + str(i)]
        dflist.append(dft)

    dffranch = pd.concat([dffranch] + dflist, ignore_index=True)

    #ldb data
    year = 2019
    ldbcols = ['trade_edit', 'legal_edit', 'ein', 'ldb_num', 'Number of Estab in EIN']
    dfldborig = opensplitnew('ldb/' + str(year) + 'ldbv1', ldbcols)
    #strip all whitespace from the LDB names so they compare with the space-free franchise names
    dfldborig['trade_nospaces'] = dfldborig['trade_edit'].apply(lambda x: re.sub(r"\s+", "", str(x), flags=re.UNICODE))
    dfldborig['legal_nospaces'] = dfldborig['legal_edit'].apply(lambda x: re.sub(r"\s+", "", str(x), flags=re.UNICODE))
    #get exact matches: once on the trade name, once on the legal name (each temporarily renamed to 'employer')
    dfldborig.rename(columns = {'trade_nospaces' : 'employer'}, inplace=True)
    mc = ['employer']
    dffranch_exact1 = merging(dffranch, dfldborig, mc, mc, True, 'inner')
    dfldborig = dfldborig.rename(columns = {'employer' : 'trade_nospaces'}).rename(columns = {'legal_nospaces' : 'employer'})
    mc = ['employer']
    dffranch_exact2 = merging(dffranch, dfldborig, mc, mc, True, 'inner')
    dfldborig.rename(columns = {'employer' : 'legal_nospaces'}, inplace=True)
    dffranch_exact = pd.concat([dffranch_exact1, dffranch_exact2], ignore_index=True)
    exactmatches = dffranch_exact['employer'].unique().tolist()
    #nonexact match names for fuzzy match
    #start with those that have more than 1 establishment
    #count how many LDB rows share each name; empty/missing names count as 0
    for i in ['trade', 'legal']:
        dfldborig['num_samename_' + i] = dfldborig.groupby(i + '_nospaces')['ldb_num'].transform('count')
        cond = ((dfldborig[i + '_nospaces'].isnull()) | (dfldborig[i + '_nospaces'] == ''))
        dfldborig['num_samename_' + i] = np.where(cond, 0, dfldborig['num_samename_' + i])
    #multiunit or common name
    cond = ((dfldborig['Number of Estab in EIN'] > 1) | (dfldborig['num_samename_trade'] > 1) | (dfldborig['num_samename_legal'] > 1))
    conda = (dfldborig['trade_nospaces'].isin(exactmatches))
    condb = (dfldborig['legal_nospaces'].isin(exactmatches))
    #candidate LDB names: multiunit/common names that were not already matched exactly
    ldbnames = [i for i in set(dfldborig[cond& ~conda]['trade_nospaces'].unique().tolist() + dfldborig[cond&~condb]['legal_nospaces'].unique().tolist()) ]
    cond = (dffranch['employer'].isin(exactmatches))
    franchnames = [i for i in set(dffranch[~cond]['employer'].unique().tolist())]

    #franchise rows that matched exactly
    dff1 = dffranch[cond]
    dff1['Dexact'] = 1

    #franchise and LDB names are vectorised together and compared against each other
    emplist = franchnames + ldbnames
    # make sure all strings
    emplist = [str(i) for i in emplist]
    # create sparse matrix
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
    tf_idf_matrix = vectorizer.fit_transform(emplist)
    # 20 and .80 are passed to cossim_top - presumably the number of top matches and the cosine cutoff; confirm against its definition
    matches = cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 20, .80)
    # creates list of matches
    matches_df = get_matches_df(matches, emplist, top='')
    # some weird rounding error where need to do .999999. anything above is exact match
    matches_df = matches_df[matches_df['ldb_cosine_fuzzy'] < .9999999]
    # keep only pairs whose left side is a franchise name
    matches_df = matches_df[matches_df['employer'].isin(franchnames)]
    # merge to ldb to make sure only keep if fuzzy match was in LDB
    matches_df = matches_df[matches_df['ldb_employer_fuzzy'].isin(ldbnames)]

    matches_df['Dfuzzy'] = 1

    #intermediate save: exact-match franchise rows plus fuzzy name pairs
    savesplit(pd.concat([dff1,matches_df], ignore_index=True), 'sba_franchise_fuzzy_pt1')

    '''
    merge it all together again
    '''

    dff = opensplitnew('sba_franchise_fuzzy_pt1', [])
    #making consistent info for exact matches
    cond = (dff['Dexact'] == 1)
    dff['ldb_employer_fuzzy'] = np.where(cond, dff['employer'],dff['ldb_employer_fuzzy'] )
    dff['ldb_cosine_fuzzy'] = np.where(cond, 1, dff['ldb_cosine_fuzzy'])
    #merge in franchise data to fuzzy matches
    cond = (dff['Dfuzzy'] == 1)
    dff1 = dff[cond][['employer', 'ldb_employer_fuzzy', 'ldb_cosine_fuzzy', 'Dfuzzy']].rename(columns = {'employer' : 'employer_match'})
    mc = ['employer_match']
    # NOTE(review): dffranchcols is not referenced again in this function
    dffranchcols = [i for i in dffranch if i not in mc]
    dfft = []
    cond = (dffranch['Dremove'] == 0)
    #rows with Dremove != 0 (not split) are joined on their full cleaned name
    dfft.append(merging(dff1, dffranch[~cond].rename(columns={'employer': 'employer_match'}).drop_duplicates(
        ['SBA FRANCHISE IDENTIFIER CODE:', 'employer_match']), mc, mc, True, 'inner'))
    #rows with Dremove == 0 are joined on each of their split pieces in turn
    for i in range(0, ncols):
        c = 'employer_' + str(i)
        dfft.append(merging(dff1, dffranch[cond].rename(columns = {c : 'employer_match'}).drop_duplicates(['SBA FRANCHISE IDENTIFIER CODE:','employer_match']), mc,mc,True, 'inner'))

    dfft = pd.concat(dfft, ignore_index= True)

    #partial_ratio between the franchise name and the LDB name, used as a second check below
    dfft['tempratio'] = dfft[['employer_match','ldb_employer_fuzzy']].apply(lambda x: fuzz.partial_ratio(x[0], x[1]), axis=1)

    #keep exact matches, plus fuzzy matches with cosine > .90, or cosine > .81 together with a partial ratio of 100
    cond = (dff['Dexact'] == 1) & (dff['ldb_employer_fuzzy'] != '')
    condb = ((dfft['ldb_cosine_fuzzy'] > .90) | ((dfft['ldb_cosine_fuzzy'] > .81) & (dfft['tempratio'] == 100))) & (dfft['ldb_employer_fuzzy'] != '')
    dff = pd.concat([dff[cond], dfft[condb]], ignore_index=True)

    #one row per franchise identifier code: the one with the highest cosine score
    dff = dff.sort_values(['SBA FRANCHISE IDENTIFIER CODE:', 'ldb_cosine_fuzzy']).drop_duplicates('SBA FRANCHISE IDENTIFIER CODE:', keep='last')


    kcols = ['SBA FRANCHISE IDENTIFIER CODE:', 'BRAND:', 'MEETS FTC DEFINITION?:',
           'IS AN  ADDENDUM NEEDED?:', 'SBA ADDENDUM - Form 2462:',
           'SBA NEGOTIATED ADDENDUM:', 'SBA FRANCHISE IDENTIFIER CODE Start Date:',
           'NOTES:', 'employer_edit',
           'employer', 'Dexact',
           'ldb_employer_fuzzy', 'ldb_cosine_fuzzy', 'Dfuzzy', 'employer_match',
           'tempratio']
    # NOTE(review): this second ldbcols list is not referenced again in this function
    ldbcols = ['trade_edit', 'legal_edit', 'ein', 'ldb_num', 'Number of Estab in EIN',
           'trade_nospaces', 'legal_nospaces']

    #attach LDB rows to the chosen names, once via the trade name and once via the legal name
    dff1 = []
    mc = ['ldb_employer_fuzzy']
    for i in ['trade', 'legal']:
        renamedict = {i + '_nospaces' : mc[0]}
        dff1.append(merging(dff[kcols], dfldborig.rename(columns = renamedict), mc,mc,True,'inner'))

    dff1 = pd.concat(dff1, ignore_index=True)

    #helper columns dropped before saving
    dcols = ['trade_edit', 'legal_edit', 'employer_edit', 'employer', 'legal_nospaces',
             'trade_nospaces', 'Dexact', 'Dfuzzy', 'employer_match', 'tempratio']
    savesplit(dff1.drop(columns = dcols), 'sba_franchise_fullymerged')
    dff1.drop(columns=dcols).to_csv(dataloc + 'sba_franchise_fullymerged.csv')
    return


'''
ppp match
'''

try:
    dfg = pd.read_excel(dataloc + 'geography/' + 'commuting_zones.xls')
    dfg['fipscounty'] = dfg['FIPS'].apply(lambda x: float(str(x)[-3:]))
    dfg['fipsstate'] = dfg['FIPS'].apply(lambda x: float(str(x)[:-3]))
    rdict = {
        'Commuting Zone ID, 2000' : 'cz',
    }
    mc = ['fipsstate', 'fipscounty']
    kcols = mc + ['cz']
    dfg = dfg.rename(columns = rdict)[kcols].drop_duplicates(subset = mc)
    czlist = dfg['cz'].unique().tolist()
    if filename.endswith('temp'):
        czlist = ['469']
    else:
        #alleidledits()
        logging.exception('files in place, now matching')

    '''
    make edits of ppp data
    '''
    #allpppedits()
    #ldbedits()

    for pppyear in [2020,
                    2021]:

        def mainfile(cz, pppyear=pppyear):
            """Match the PPP loans of one commuting zone to the LDB.

            ``cz`` is the commuting zone id used to build the file names
            ('ppp_<cz>.psv' and 'ldb_<cz>.psv'); ``pppyear`` is the approval
            year of the loans to keep and is bound as a default argument to
            the year of the enclosing loop.

            Returns a DataFrame holding matched loans, unmatched loans
            ('No Match') and the separately handled out-of-scope business
            types.  Returns None when an input file is missing, when either
            side is empty, or when any exception is raised (the failure is
            logged and printed).
            """
            try:
                logging.exception('*******************')
                logging.exception(cz)
                logging.exception('*******************')
                # open ppp
                cz = str(int(cz))
                filename = 'ppp_' + cz
                # both the loan file and the LDB file for this zone must exist
                if (os.path.exists(dataloc + 'pppfiles/cz/' + filename + '.psv')) & (
                os.path.exists(dataloc + 'pppfiles/ldb/cz/' + 'ldb_' + cz + '.psv')):
                    dfp = pd.read_csv(dataloc + 'pppfiles/cz/' + filename + '.psv', sep='|')
                    trenamedict = {'Lender': 'ServicingLenderName',
                                   'LoanAmount': 'CurrentApprovalAmount',
                                   'Zip': 'BorrowerZip',
                                   'RaceEthnicity': 'Race',
                                   'BusinessName': 'BorrowerName',
                                   'City': 'BorrowerCity',
                                   # 'NonProfit' : '',
                                   'Address': 'BorrowerAddress'
                                   }
                    # got it backwards: invert so the file's column names map to the short names
                    trenamedict = {v: k for k, v in trenamedict.items()}
                    dfp.rename(columns=trenamedict, inplace=True)

                    ##keep only loans approved in pppyear
                    dfp['loanyear'] = pd.to_datetime(dfp['DateApproved']).dt.year
                    kcond = (dfp['loanyear'] == pppyear)
                    dfp = dfp[kcond]
                    # these business types are set aside as "out of scope" (dfpoos)
                    kcond = (dfp['BusinessType'].isin(['Self-Employed Individuals',
                                                       'Independent Contractors', 'Sole Proprietorship']))
                    # so are non-profits with the NAICS code 813110 (religious org)
                    kcond = (kcond | ((dfp['BusinessType'] == 'Non-Profit Organization') & (dfp['NAICSCode'] == 813110)))
                    dfpoos = dfp[kcond]
                    dfp = dfp[~kcond]


                    # this list is for the fuzzy matches
                    dftlist = []
                    # this list is for the address matches
                    dflistmain = []
                    filename = 'ldb_' + cz
                    # dfldb = opensplitnew('pppfiles/' + filename, [])
                    # LDB coordinates are renamed so they do not clash with the loan-side Latitude/Longitude
                    dfldb = pd.read_csv(dataloc + 'pppfiles/ldb/cz/' + filename + '.psv', sep='|').rename(
                        columns={'Latitude': "lat_ldb",
                                 'Longitude': 'long_ldb'})
                    # get franchise info: flag LDB rows whose ein appears in the merged SBA franchise file
                    einlist = pd.read_csv(dataloc + 'sba_franchise_fullymerged.csv')['ein'].unique().tolist()
                    cond = (dfldb['ein'].isin(einlist))
                    dfldb['Dfranchise'] = np.where(cond, 1, 0)
                    # release the list, it is no longer needed
                    einlist = None
                    # keep ldb if private (owner_code 5)
                    cond = (dfldb['owner_code'] == 5)
                    dfldb = dfldb[cond]
                    # exact address matching
                    # loan-side columns carried through the exact address match
                    pppkeepcols = ['Address', 'BusinessName', 'BusinessType', 'CD', 'City', 'DateApproved', 'Gender',
                                   'JobsReported', 'Lender',
                                   'LoanAmount', 'NAICSCode', 'Name of File', 'RaceEthnicity', 'State', 'Veteran',
                                   'Zip', 'st', 'ID',
                                   'Latitude', 'Longitude', 'Dzip_problem',
                                   'naics2dig', 'employer_orig',
                                   'ForgivenessAmount', 'ForgivenessDate', 'UndisbursedAmount', 'PAYROLL_PROCEED',
                                   ]
                    if (len(dfp) > 0) & (len(dfldb) > 0):
                        # keep a handle on the full in-scope loan frame; used below to find the unmatched loans
                        dfpo = dfp
                        # pcols1 = [ 'fipsstate', 'fipscounty',     'cz', ]
                        # pcols2 = [  'addr_edit', 'city', 'employer_edit',]
                        # dfpo[dfpo['ID'].isin(tlist)][pcols2].describe()
                        # dfpo[dfpo['ID'].isin(tlist)][pcols1]
                        # dfpo[dfpo['ID'].isin(tlist)][pcols2]
                        df, dfp, dfldb = exactaddress(dfp, dfldb, pppkeepcols)
                        if (len(dfp)>0) & (len(dfldb)>0):
                            # #address matching
                            df, dfp, dfldb = addressmatch(df, dfp.drop(columns=['_merge']), dfldb)
                            logging.exception(cz + ' exact matching done, now doing fuzzy matching')

                        ####fuzzy matching on remaining
                        ldbcols = ['trade_edit', 'legal_edit', 'city', 'fipsstate', 'fipscounty', 'naics2dig',
                                   'employer_first', 'cz']

                        pcols = ['employer', 'city', 'fipsstate', 'fipscounty', 'naics2dig', 'employer_first', 'cz']
                        # first word of the edited loan name
                        dfp['employer_first'] = dfp['employer_edit'].apply(lambda x: str(x).split(' ')[0])
                        #########new function for out of scope units - focused on only HIGH quality matches
                        # this also replaces dfldb with the frame returned by the out-of-scope matching
                        df1, dfldb = oos_ppp_match(dfpoos, dfldb, pppkeepcols)
                        if (len(dfp) > 0) & (len(dfldb) > 0):
                            # only the distinct name/geography combinations are sent to the fuzzy matcher
                            dft = stateloopfull(dfp[pcols].drop_duplicates(pcols), dfldb[ldbcols].drop_duplicates(ldbcols), cz)
                            logging.exception(cz + 'fuzzy matching done, now doing final steps')
                            # merge in ppp info
                            mc = ['city', 'employer', 'fipscounty', 'cz', 'fipsstate']
                            dft = dft.drop_duplicates(mc).merge(dfp, on=mc, how='left')

                            # merge in LDB info
                            # ldbfuzzymerge selects 'naics2dig_ldb', so rename on the LDB frame first
                            renamedict = {
                                'naics2dig': 'naics2dig_ldb'
                            }
                            dfldb.rename(columns=renamedict, inplace=True)
                            renamedict = {
                                'ldb_employer_fuzzy': 'employer_fuzzy'
                            }
                            dft.rename(columns=renamedict, inplace=True)
                            # puts all fuzzy ldb matches together
                            dft = ldbfuzzymerge(dft, dfldb)
                            dftlist.append(bestfuzzymerge(dft, dfldb))
                        # address matches first, then fuzzy matches (if any)
                        dflistmain.append(pd.concat(df, ignore_index=True))
                        df = pd.concat(dflistmain + dftlist, ignore_index=True)
                        # deduplicate LDB
                        df = dedup_ldb(df)

                        # get no matches: in-scope loans whose ID is not among the matches
                        dlist = df['ID'].unique().tolist()
                        cond = (dfpo['ID'].isin(dlist))
                        dfp = dfpo[~cond]
                        dfp['fuzzy_type'] = 'No Match'


                        # '''
                        # testing
                        # '''
                        # df['test'] = df['CURREMP']  / df['aaemp']
                        # df['test2'] = df['CURREMP']  - df['aaemp']
                        # df['test'] = df['test'].replace([np.inf, -np.inf], np.NaN)
                        # print(df.groupby(['fuzzy_type','fuzzy_group'])['test2'].describe())
                        #
                        # cond = (df['test2'] > 100)
                        # samp(df[cond])
                        #
                        # print(df['CURREMP'].sum())
                        # print(dfp['CURREMP'].sum())
                        #
                        # print(df['INITLAPPVAMT'].sum())
                        # print(dfp['INITLAPPVAMT'].sum())

                        # matched loans, unmatched loans, then the out-of-scope results
                        df = pd.concat([df, dfp,  df1
                                        ], ignore_index=True)
                        # columns removed from the returned frame; drop() raises if one of them is absent,
                        # which the enclosing except turns into a logged failure for this zone
                        dcols = ['Number of Estab in EIN', 'Address',
                                 'BusinessName',
                                 'Unnamed: 0', 'Unnamed: 0.1', 'SBAOfficeCode', 'ProcessingMethod',
                                 'BorrowerState', 'LoanStatusDate', 'LoanStatus', 'Term',
                                 'SBAGuarantyPercentage', 'InitialApprovalAmount',
                                 'ServicingLenderLocationID', 'ServicingLenderAddress',
                                 'ServicingLenderCity', 'ServicingLenderState', 'ServicingLenderZip',
                                 'RuralUrbanIndicator', 'HubzoneIndicator', 'LMIIndicator',
                                 'BusinessAgeDescription', 'ProjectCity', 'ProjectCountyName',
                                 'ProjectState', 'ProjectZip',
                                 'OriginatingLenderLocationID', 'st_cz', 'loanyear', '_merge']

                        return (df.drop(columns=dcols))
                    else:
                        # nothing to match on one of the two sides
                        return
                else:
                    # input file(s) for this zone not found
                    return
            except:
                # any failure is logged (with traceback) and the zone is skipped by returning None
                logging.exception('this cz screwed up: '+str(cz))
                print('this cz screwed up: ' + str(cz))
                return

        '''
        the matching
        '''
        # Run mainfile once per entry of czlist in a pool of worker processes,
        # stack the per-entry results into a single frame, and export it.
        # NOTE(review): ThreadPool, mp and partial are imported here but not used
        # by the lines below - confirm nothing later relies on them before removing.
        from multiprocessing.dummy import Pool as ThreadPool
        import multiprocessing as mp
        from multiprocessing import Pool
        from functools import partial
        procs = 12  # number of worker processes handed to Pool
        #mainfilepart = partial(mainfile)
        with Pool(procs) as p:
            # map blocks until every czlist entry has been processed and
            # returns the results as a list in czlist order
            df = p.map(mainfile, czlist)
        # mainfile returns None when it hits an exception or takes one of its
        # empty branches; pd.concat silently drops None entries, but raises
        # ValueError if every entry is None.
        df = pd.concat(df, ignore_index=True)

        #savesplit(df, 'pppmatched')
        # Pipe-delimited CSV export under dataloc, named by pppyear
        # (this copy is for transferring).
        df.to_csv(dataloc + 'ppp'+str(pppyear)+'matched.csv', sep= '|')

    '''
    test
    '''
    # logging.debug(df.groupby(['fuzzy_type','BusTypDesc'])['CURREMP'].sum() / df.groupby(['fuzzy_type'])['CURREMP'].sum())
    # logging.debug(df.groupby(['fuzzy_type', 'BusTypDesc'])['INITLAPPVAMT'].sum() / df.groupby(['fuzzy_type'])['INITLAPPVAMT'].sum())
    # logging.debug(df.groupby(['fuzzy_type'])['CURREMP'].sum() / df['CURREMP'].sum())
    # logging.debug(df.groupby(['fuzzy_type'])['INITLAPPVAMT'].sum() / df['INITLAPPVAMT'].sum())
    goodemail(filename)


except Exception:
    logging.exception('fatal error')
    msg = 'fail: '
    bademail(filename)

'''
eidl merge
'''
#
# try:
#     statefips = pd.read_pickle(geog + 'statefips.pkl')
#     if filename.endswith('temp'):
#         stateabbrev = ['RI']
#     else:
#         #alleidledits()
#         logging.exception('files in place, now matching')
#         stateabbrev = ['CA_1', 'DE', 'CA_3',  'ND', 'TX', 'DC', 'NY',  'WV', 'FL','AL','NE', 'PA', 'AK', 'NJ',
#                        'NV', 'AZ', 'NH', 'CA_2', 'AR','VA','MI',
#                         'NM', 'CO', 'CT', 'NC',
#                        'OH', 'OK', 'GA', 'OR', 'HI', 'ID', 'IL', 'RI', 'IN', 'SC', 'IA', 'SD',
#                        'KS', 'TN', 'KY',  'LA', 'UT', 'ME', 'VT', 'MD', 'MA', 'WA', 'MN', 'MS', 'WI', 'MO',
#                        'WY', 'MT', ]
#         # stateabbrev = ['CA', 'DE', 'TX', 'DC', 'NY',  'WV', 'FL','AL','NE', 'PA', 'AK', 'NJ',
#         #                'NV', 'AZ', 'NH', 'AR','VA','MI',
#         #                 'NM', 'CO', 'CT', 'NC',
#         #                'OH', 'OK', 'GA', 'OR', 'HI', 'ID', 'IL', 'RI', 'IN', 'SC', 'IA', 'SD',
#         #                'KS', 'TN', 'KY',  'LA', 'UT', 'ME', 'VT', 'MD', 'MA', 'WA', 'MN', 'MS', 'WI', 'MO',
#         #                'WY', 'MT', 'ND',]
#
#     '''
#     the matching
#     '''
#     from multiprocessing.dummy import Pool as ThreadPool
#     import multiprocessing as mp
#     from multiprocessing import Pool
#     from functools import partial
#     procs = 20
#     #mainfilepart = partial(mainfile)
#     with Pool(procs) as p:
#         df = p.map(mainfile_eidl, stateabbrev)
#     df = pd.concat(df, ignore_index=True)
#     #dealing with CA repeats
#     cond = (df['fipsstate'] == stateinfo('CA', 'fips'))
#     dft = df[cond]
#     tdict = {'Exact Address Match' : 1, 'Exact Match' : 3, 'Address Match' : 2,
#        'Fuzzy Match' : 4, 'No Match' : 5}
#     dft['trank'] = dft['fuzzy_type'].map(tdict)
#     df = pd.concat([df[~cond], dft.sort_values(['ID', 'trank']).drop_duplicates(['ID'], keep='first')], ignore_index=True)
#     #savesplit(df, 'eidlloanmatched')
#     savesplit(df, 'eidlgrantmatched')
#     #this is for transferring
#     #df.to_csv(dataloc + 'eidlloanmatched.csv', sep= '|')
#     df.to_csv(dataloc + 'eidlgrantmatched.csv', sep= '|')
#
#     '''
#     test
#     '''
#     # logging.debug(df.groupby(['fuzzy_type','BusTypDesc'])['CURREMP'].sum() / df.groupby(['fuzzy_type'])['CURREMP'].sum())
#     # logging.debug(df.groupby(['fuzzy_type', 'BusTypDesc'])['INITLAPPVAMT'].sum() / df.groupby(['fuzzy_type'])['INITLAPPVAMT'].sum())
#     # logging.debug(df.groupby(['fuzzy_type'])['CURREMP'].sum() / df['CURREMP'].sum())
#     # logging.debug(df.groupby(['fuzzy_type'])['INITLAPPVAMT'].sum() / df['INITLAPPVAMT'].sum())
#     goodemail(filename)
# except Exception:
#     logging.exception('fatal error')
#     msg = 'fail: '
#     bademail(filename)





'''
sba franchise merge
'''
#
#
# try:
#     logging.exception('this is for doing the franchise merge')
#     sbafuzzymerge()
#
# except Exception:
#     logging.exception('fatal error')
#     msg = 'fail: '
#     bademail(filename)

'''
PPP match remaining
'''
# try:
#     statefips = pd.read_pickle(geog + 'statefips.pkl')
#     if filename.endswith('temp'):
#         stateabbrev = ['CA_2', 'RI']
#     else:
#         #alleidledits()
#         logging.exception('files in place, now matching')
#         stateabbrev = ['CA_1', 'DE', 'CA_3',  'ND', 'TX', 'DC', 'NY',  'WV', 'FL','AL','NE', 'PA', 'AK', 'NJ',
#                        'NV', 'AZ', 'NH', 'CA_2', 'AR','VA','MI',
#                         'NM', 'CO', 'CT', 'NC',
#                        'OH', 'OK', 'GA', 'OR', 'HI', 'ID', 'IL', 'RI', 'IN', 'SC', 'IA', 'SD',
#                        'KS', 'TN', 'KY',  'LA', 'UT', 'ME', 'VT', 'MD', 'MA', 'WA', 'MN', 'MS', 'WI', 'MO',
#                        'WY', 'MT', ]
#         # stateabbrev = ['CA', 'DE', 'TX', 'DC', 'NY',  'WV', 'FL','AL','NE', 'PA', 'AK', 'NJ',
#         #                'NV', 'AZ', 'NH', 'AR','VA','MI',
#         #                 'NM', 'CO', 'CT', 'NC',
#         #                'OH', 'OK', 'GA', 'OR', 'HI', 'ID', 'IL', 'RI', 'IN', 'SC', 'IA', 'SD',
#         #                'KS', 'TN', 'KY',  'LA', 'UT', 'ME', 'VT', 'MD', 'MA', 'WA', 'MN', 'MS', 'WI', 'MO',
#         #                'WY', 'MT', 'ND',]
#
#     '''
#     the matching
#     '''
#     from multiprocessing.dummy import Pool as ThreadPool
#     import multiprocessing as mp
#     from multiprocessing import Pool
#     from functools import partial
#     procs = 20
#     #mainfilepart = partial(mainfile)
#     with Pool(procs) as p:
#         df = p.map(mainfile, stateabbrev)
#     df = pd.concat(df, ignore_index=True)
#     #dealing with CA repeats
#     cond = (df['fipsstate'] == stateinfo('CA', 'fips'))
#     dft = df[cond]
#     tdict = {'Exact Address Match' : 1, 'Exact Match' : 3, 'Address Match' : 2,
#        'Fuzzy Match' : 4, 'No Match' : 5}
#     dft['trank'] = dft['fuzzy_type'].map(tdict)
#     df = pd.concat([df[~cond], dft.sort_values(['ID', 'trank']).drop_duplicates(['ID'], keep='first')], ignore_index=True)
#     savesplit(df, 'pppmatched')
#     #this is for transferring
#     df.to_csv(dataloc + 'pppmatched.csv', sep= '|')
#
# except Exception:
#     logging.exception('fatal error')
#     msg = 'fail: '
#     bademail(filename)
'''
zip files
'''
import zipfile
dataloc = "/dataERS/eract/daltonm/"

# Bundle the matched CSV outputs found under dataloc into ppp_files.zip.
# Each member is stored under its bare file name (no directory prefix).
csvnames = [f + ".csv" for f in
            ['eidlloanmatched', 'eidlgrantmatched', 'ppp2020matched', 'ppp2021matched']]

# Opening the archive with mode='w' truncates any existing ppp_files.zip, so
# verify the inputs first: a missing CSV then fails before a previously good
# archive is overwritten with a partial one.
missing = [name for name in csvnames if not os.path.exists(dataloc + name)]
if missing:
    raise FileNotFoundError(
        'cannot build ppp_files.zip, missing input(s) in ' + dataloc + ': '
        + ', '.join(missing))

# The context manager finalises the archive and releases the file handle even
# when a member write raises, which the explicit close() call did not.
with zipfile.ZipFile(dataloc + "ppp_files.zip", mode='w') as zfile:
    for f in csvnames:
        # Write to zip file
        zfile.write(dataloc + f, arcname=f)

